#!/usr/bin/env python
#-*- coding: utf-8 -*-


import signal
import sys
import argparse
import re
import json

from mandja.crawler import Crawler
from mandja.parser import UrlParser
from mandja.session import Session


def get_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what the
            script entry point relies on.

    Returns:
        The ``argparse.Namespace`` with the attributes ``url``,
        ``httpauth_creds``, ``session_file``, ``go_regex``, ``visit_regex``,
        ``extract_option`` and ``output_fields``. The two ``append`` options
        (``extract_option``, ``output_fields``) are ``None`` when not given;
        their documented defaults are applied by the caller.
    """
    parser = argparse.ArgumentParser(
        description=('A little recursive crawler. '
                     'It is able to list all URLs it has visited. '
                     'You reuse the connection via Keep-Connection when '
                     'crawling on the set domain and use different connections '
                     'for the head ping of URLs on different domains.'),
        epilog='Copyright (C) 2011 MrGray')

    parser.add_argument("url",
                        help="URL to crawl")
    parser.add_argument("-a", "--httpauth-creds",
                        help=("HTTP authentication credentials passed as a json string: "
                              "-a '{\"<netloc>\": {\"username\": \"<user>\", \"password\": \"<password>\"}}'"))
    parser.add_argument("-s", "--session-file",
                        help=("If a session file is used - all the data from URL processing is saved "
                              "including the current state. If the crawling is interrupted you can "
                              "continue later using the session file. When crawling has finished, "
                              "this state is marked also. By using the file you can filter/parse/extract data "
                              "related to the crawled URL offline."))
    parser.add_argument("-g", "--go-regex",
                        default="^.+$",
                        help="Pre-visit regex filter to determine if an URL is permitted to be crawled")
    parser.add_argument("-u", "--visit-regex",
                        default="^.+$",
                        help=("A regex which is used over extracted URLs to determine "
                              "if it has been visited. This way a 'visited' mask is formed"))
    # No argparse-level default for the two "append" options: with
    # action="append" a list default would be extended rather than replaced,
    # so the fallback value is left to the caller.
    parser.add_argument("-e", "--extract-option",
                        action="append",
                        choices=["a-href", "link-href", "script-src", "form-action", "sick-match"],
                        help=("These options present the type of extraction filter used to determine which URLs should be extracted. "
                              "This is used on the crawling phase - if you miss an option you would not visit the kind of URLs. "
                              "The default is: ['a-href']. "
                              "The meaning of options is: 'a-href' - visit URLs gathered from <a href=\"..\"> tags, "
                              "'link-href' - URLs from <link href=\"..\">, "
                              "'script-src' - URLs from <script src=\"..\">, "
                              "'form-action' - URLs from <form action=\"..\">, "
                              "'sick-match' - using a sick general regex to find anything looking like an URL"))
    parser.add_argument("-o", "--output-fields",
                        action="append",
                        choices=["url", "status", "from_main_domain"],
                        help=("This presents the information that is output on the screen. "
                              "Default is: ['url', 'status']"))

    return parser.parse_args(argv)


def kint_sig_handler(signal, frame):
    """Signal handler: report the keyboard interrupt and exit with status 0.

    Args:
        signal: Signal number passed in by the interpreter (unused). Inside
            this function the name shadows the ``signal`` module.
        frame: Stack frame that was interrupted (unused).

    Raises:
        SystemExit: always, with exit code 0.
    """
    # The parenthesised single-argument form prints the same text whether
    # ``print`` is a statement (Python 2) or a function (Python 3).
    print('You pressed Ctrl+C!')
    sys.exit(0)


def _load_httpauth_creds(creds_json):
    """Decode the --httpauth-creds JSON string; return {} if absent or invalid."""
    if not creds_json:
        return {}
    try:
        return json.loads(creds_json)
    except (ValueError, TypeError) as err:
        # Always hand back a usable value: the caller passes the result to
        # the crawler even when the option had to be discarded.
        sys.stderr.write("Warning: --httpauth-creds discarded, probably wrong JSON representation - %s for %s\n"
                         % (str(err), creds_json))
        return {}


def _usable_regex(pattern, warning, fallback="^.+$"):
    """Return pattern if it compiles; otherwise warn on stderr and return fallback."""
    try:
        re.compile(pattern)
    except re.error:
        sys.stderr.write(warning + "\n")
        return fallback
    return pattern


def main():
    """Script entry point: parse the command line, crawl, and print results.

    Installs the Ctrl+C handler, reads the options via ``get_args()``,
    replaces unusable option values with their defaults (emitting a warning
    on stderr), builds the ``Crawler`` and prints one line per reported URL
    with the selected output fields joined by ``"    |    "``.
    """
    signal.signal(signal.SIGINT, kint_sig_handler)
    args = get_args()

    url = args.url

    httpauth_creds = _load_httpauth_creds(args.httpauth_creds)

    # The "append" options come back as None when not given on the command line.
    output_fields = args.output_fields if args.output_fields else ["url", "status"]
    extract_options = args.extract_option if args.extract_option else ["a-href"]

    # Validate the go filter; fall back to the match-anything pattern.
    go_regex = _usable_regex(args.go_regex,
                             "Warning: Discarding --go-regex %s" % args.go_regex)

    # Validate the visited filter the same way.
    visit_regex = _usable_regex(args.visit_regex,
                                "Warning: Discarding --visit-regex")

    session_file = args.session_file

    cr_ob = Crawler(url,
                    UrlParser(extract_options),
                    Session(session_file=session_file,
                            visit_regex=visit_regex),
                    go_regex=go_regex,
                    httpauth_creds=httpauth_creds)

    for url_info in cr_ob.start_crawl():
        # NOTE(review): only entries with a truthy ``from_domain`` are
        # printed, so the 'from_main_domain' column never shows a falsy
        # value - confirm this filter is intended.
        if not url_info.from_domain:
            continue

        # Columns are emitted in this fixed order regardless of the order in
        # which the -o options were given.
        columns = []
        if 'url' in output_fields:
            columns.append(url_info.url)
        if 'status' in output_fields:
            columns.append(str(url_info.status))
        if 'from_main_domain' in output_fields:
            columns.append(str(url_info.from_domain))

        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print("    |    ".join(columns))


# Start the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
